Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.

Differential Gene Expression Analysis

Perform Differential Gene Expression (DGE) analysis using a read count table (usually from Salmon) and a sample metadata sheet (user made). Red text indicates regions that require the user to modify. Regardless, the user should check over all code blocks to ensure that everything is running correctly.

1. Load packages

Load packages

library(edgeR, quietly = TRUE) #edgeR-v3.30.3
library(vegan, quietly = TRUE)
library(Dune, quietly = TRUE)
library(ggplot2, quietly = TRUE) #ggplot2-v3.3.5
library(tidyverse, quietly = TRUE) #tidyverse-v1.3.1
library(ComplexHeatmap, quietly = TRUE)
library(DESeq2, quietly = TRUE)
library(genefilter, quietly = TRUE)

2. Set variables

Change file names and conditions where appropriate.

# Input unfiltered data
treatmentinfo.file <- "samples.tsv"
gcount.file <- "salmon.numreads.tsv"
# Output DEG results
DGE_results.file <- "salmon.numreads.DGE_results.tsv"
DGE_samples.file <- "salmon.numreads.DGE_samples.tsv"

# Treatment conditions and variables to consider
condition <- "treatment"
treatments.to.compare <- c("HTAC", "ATAC")
treatment.colors <- c("ATAC" = "#1b9e77", "HTAC" = "#d95f02")

# pOverA gene filtering thresholds (only process genes with over these cutoffs)
pOverA.cutoff.P <- 0.8 # >80% of samples
pOverA.cutoff.A <- 10  # >10 read counts

# DESeq2 results filtering thresholds (only report results below these cutoffs)
lfcThreshold.cutoff <- log2(1.5) # Log threshold to use for DESeq2 results filtering
padj.cutoff <- 0.05 # Adjusted p-value threshold to use for DESeq2 results filtering

3. Load, clean, and pre-processing datasets

Load the input file containing the treatment information.

treatmentinfo <- read.csv(treatmentinfo.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM") # Read in file
head(treatmentinfo)
##              sample_id unit lib_type         fq1         fq2 species plug_id
## 1 Pacuta_ATAC_TP1_1043    1       pe SRR14610932 SRR14610932  Pacuta    1043
## 2 Pacuta_ATAC_TP1_1775    1       pe SRR14610931 SRR14610931  Pacuta    1775
## 3 Pacuta_ATAC_TP1_2363    1       pe SRR14610930 SRR14610930  Pacuta    2363
## 4 Pacuta_ATAC_TP3_1041    1       pe SRR14610929 SRR14610929  Pacuta    1041
## 5 Pacuta_ATAC_TP3_1471    1       pe SRR14610928 SRR14610928  Pacuta    1471
## 6 Pacuta_ATAC_TP3_1637    1       pe SRR14610927 SRR14610927  Pacuta    1637
##   treatment timepoint            reef ploidy  group
## 1      ATAC         1 Lilipuna.Fringe      3 Group4
## 2      ATAC         1      Reef.42.43      2 Group5
## 3      ATAC         1         Reef.18      3 Group3
## 4      ATAC         3      Reef.11.13      3 Group2
## 5      ATAC         3      Reef.35.36      3 Group3
## 6      ATAC         3      Reef.11.13      3 Group2
# Check we have the right column names
headers <- c("sample_id", condition)
if( all(headers %in% colnames(treatmentinfo)) ){
  cat(paste(treatmentinfo.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(treatmentinfo.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## samples.tsv has the required columns: sample_id, treatment

Filter treatmentinfo to just the samples that we want to analyze. Adjust filtering as required.

treatmentinfo <- treatmentinfo %>%
  filter(!!rlang::sym(condition) %in% treatments.to.compare & timepoint=="8") %>%
  mutate({{condition}} := factor(!!rlang::sym(condition), levels = treatments.to.compare))
head(treatmentinfo)
##              sample_id unit lib_type         fq1         fq2 species plug_id
## 1 Pacuta_ATAC_TP8_1051    1       pe SRR14610912 SRR14610912  Pacuta    1051
## 2 Pacuta_ATAC_TP8_1755    1       pe SRR14610911 SRR14610911  Pacuta    1755
## 3 Pacuta_ATAC_TP8_2012    1       pe SRR14610910 SRR14610910  Pacuta    2012
## 4 Pacuta_HTAC_TP8_1329    1       pe SRR14611019 SRR14611019  Pacuta    1329
## 5 Pacuta_HTAC_TP8_1765    1       pe SRR14611018 SRR14611018  Pacuta    1765
## 6 Pacuta_HTAC_TP8_2513    1       pe SRR14611017 SRR14611017  Pacuta    2513
##   treatment timepoint            reef ploidy  group
## 1      ATAC         8      Reef.11.13      3 Group1
## 2      ATAC         8      Reef.42.43      2 Group5
## 3      ATAC         8      Reef.42.43      3 Group2
## 4      HTAC         8 Lilipuna.Fringe      2 Group6
## 5      HTAC         8      Reef.42.43      3 Group3
## 6      HTAC         8         Reef.18      3 Group2

Load the input file containing the gene count matrix.

gcount <- as.data.frame(read.csv(gcount.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM")) # Read in file

# Check we have the right column names
headers <- c("Name", treatmentinfo$sample_id)
if( all(headers %in% colnames(gcount)) ){
  cat(paste(gcount.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(gcount.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## salmon.numreads.tsv has the required columns: Name, Pacuta_ATAC_TP8_1051, Pacuta_ATAC_TP8_1755, Pacuta_ATAC_TP8_2012, Pacuta_HTAC_TP8_1329, Pacuta_HTAC_TP8_1765, Pacuta_HTAC_TP8_2513
# Cleanup gcount data
gcount <- gcount %>% 
  column_to_rownames("Name") %>% # Makes "Name" column rownames
  round() %>% # Round
  select(treatmentinfo$sample_id) # Select just columns in filtered treatmentinfo file

# View dataset attributes
d <- dim(gcount); print(paste("rows:",d[[1]]," columns:",d[[2]], sep=''))
## [1] "rows:33259 columns:6"
head(gcount)[,1:3]
##                                             Pacuta_ATAC_TP8_1051
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1                   51
## Pocillopora_acuta_KBHIv2___TS.g16868.t1                      472
## Pocillopora_acuta_KBHIv2___RNAseq.g25384.t1                    1
## Pocillopora_acuta_KBHIv2___RNAseq.g30122.t1                    0
## Pocillopora_acuta_KBHIv2___RNAseq.g12817.t1                    0
## Pocillopora_acuta_KBHIv2___TS.g9097.t1                       140
##                                             Pacuta_ATAC_TP8_1755
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1                   30
## Pocillopora_acuta_KBHIv2___TS.g16868.t1                      364
## Pocillopora_acuta_KBHIv2___RNAseq.g25384.t1                    0
## Pocillopora_acuta_KBHIv2___RNAseq.g30122.t1                    0
## Pocillopora_acuta_KBHIv2___RNAseq.g12817.t1                    1
## Pocillopora_acuta_KBHIv2___TS.g9097.t1                        24
##                                             Pacuta_ATAC_TP8_2012
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1                   51
## Pocillopora_acuta_KBHIv2___TS.g16868.t1                      433
## Pocillopora_acuta_KBHIv2___RNAseq.g25384.t1                   14
## Pocillopora_acuta_KBHIv2___RNAseq.g30122.t1                    0
## Pocillopora_acuta_KBHIv2___RNAseq.g12817.t1                    0
## Pocillopora_acuta_KBHIv2___TS.g9097.t1                       127

Filter gcount to just genes with enough data across columns (exclude genes which are mostly zeros)

# Create filter for the counts data
filt <- filterfun(pOverA(pOverA.cutoff.P, pOverA.cutoff.A))
gfilt <- genefilter(gcount, filt)
# Identify genes to keep by count filter
keep <- gcount[gfilt,]
# Identify gene lists
keep <- rownames(keep)
# Gene count data filtered in PoverA, P percent of the samples have counts over A
gcount_filt <- as.data.frame(gcount[which(rownames(gcount) %in% keep),])
d <- dim(gcount);      print(paste("gcount      - rows:",d[[1]]," columns:",d[[2]], sep='')) # Before filtering
## [1] "gcount      - rows:33259 columns:6"
d <- dim(gcount_filt); print(paste("gcount_filt - rows:",d[[1]]," columns:",d[[2]], sep='')) # After filtering
## [1] "gcount_filt - rows:15583 columns:6"

Create a DESeqDataSet design from gene count matrix and labels. Here we set the design to look at the column listed in the condition variable.

gdds <- DESeqDataSetFromMatrix(countData = as.data.frame(gcount_filt),
                               colData = treatmentinfo,
                               design = as.formula(paste("~",condition, sep=''))
                               )
## converting counts to integer mode

4. Differential Gene Expression Analysis

Run DE analysis

Run differential expression test using a Wald model.

DEG <- DESeq(gdds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Explore significant p-values for condition pairs

# Extract DESeq2 results
DEG.results <- results(DEG, contrast= c(condition, treatments.to.compare), lfcThreshold=lfcThreshold.cutoff)
head(DEG.results)
## log2 fold change (MLE): treatment HTAC vs ATAC 
## Wald test p-value: treatment HTAC vs ATAC 
## DataFrame with 6 rows and 6 columns
##                                              baseMean log2FoldChange     lfcSE
##                                             <numeric>      <numeric> <numeric>
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1   51.4931       0.426436  0.423765
## Pocillopora_acuta_KBHIv2___TS.g16868.t1      368.5623      -0.470140  0.343510
## Pocillopora_acuta_KBHIv2___TS.g9097.t1        81.9351      -0.520940  0.596254
## Pocillopora_acuta_KBHIv2___RNAseq.g17538.t1   29.0177       0.722860  0.627746
## Pocillopora_acuta_KBHIv2___RNAseq.g4713.t1   584.8922      -3.112965  0.798367
## Pocillopora_acuta_KBHIv2___RNAseq.g19105.t1   46.5179      -0.453354  0.390970
##                                                  stat     pvalue      padj
##                                             <numeric>  <numeric> <numeric>
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1  0.000000 1.00000000  1.000000
## Pocillopora_acuta_KBHIv2___TS.g16868.t1      0.000000 1.00000000  1.000000
## Pocillopora_acuta_KBHIv2___TS.g9097.t1       0.000000 1.00000000  1.000000
## Pocillopora_acuta_KBHIv2___RNAseq.g17538.t1  0.219671 0.82612729        NA
## Pocillopora_acuta_KBHIv2___RNAseq.g4713.t1  -3.166467 0.00154303  0.364046
## Pocillopora_acuta_KBHIv2___RNAseq.g19105.t1  0.000000 1.00000000  1.000000
# Filter results
DEGs <- as.data.frame(subset(DEG.results, padj<padj.cutoff))
nrow(DEGs)
## [1] 25
# Order p-values by smallest value first
results.ordered <- order(DEGs$padj)

# Make row names a column before writing to file.
DEGs$gene_id  <- rownames(DEGs)
rownames(DEGs) <- NULL
DEGs <- relocate(DEGs, "gene_id", .before="baseMean")

# Write filtered DGE results and samples
write.table(DEGs, DGE_results.file, sep='\t', row.names=F, quote=F)
write.table(treatmentinfo, DGE_samples.file, sep='\t', row.names=F, quote=F)

We will now transform them with cpm for plotting, we will also subset the gene count matrix by the list of DEGs.

DEGlist <- gdds[DEGs$gene_id, ]

Apply a variance stabilizing transformation to minimize effects of small counts and normalize by library size

# Get size factor for each sample
sf <- estimateSizeFactors(gdds)$sizeFactor
print(paste("Max size factor: ",max(sf), sep=''))
## [1] "Max size factor: 1.85902608445539"
print(paste("Min size factor: ",min(sf), sep=''))
## [1] "Min size factor: 0.75064280748646"
# If above max(size factors) is less than 4, we can use VST.
Gvst <- vst(as.matrix(gcount_filt), blind=TRUE, fitType = "local")
## converting counts to integer mode
DEGvst <- Gvst[DEGs$gene_id,]
head(DEGvst) # View transformed gene count data
##                                             Pacuta_ATAC_TP8_1051
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1              7.393249
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1             14.109410
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1            10.275293
## Pocillopora_acuta_KBHIv2___TS.g3591.t1                 10.438549
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1             10.918501
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2             8.270558
##                                             Pacuta_ATAC_TP8_1755
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1              6.738974
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1             13.139975
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1             8.237673
## Pocillopora_acuta_KBHIv2___TS.g3591.t1                  9.244169
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1              9.565123
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2             8.257337
##                                             Pacuta_ATAC_TP8_2012
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1              7.422574
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1             12.196555
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1             8.541758
## Pocillopora_acuta_KBHIv2___TS.g3591.t1                  9.962511
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1             10.644623
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2             8.926995
##                                             Pacuta_HTAC_TP8_1329
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1              5.988370
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1              8.136902
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1             7.076576
## Pocillopora_acuta_KBHIv2___TS.g3591.t1                  7.244905
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1              7.730138
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2             6.349941
##                                             Pacuta_HTAC_TP8_1765
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1              5.741367
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1             10.623002
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1             6.879131
## Pocillopora_acuta_KBHIv2___TS.g3591.t1                  8.079371
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1              8.544964
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2             7.015737
##                                             Pacuta_HTAC_TP8_2513
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1              5.654871
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1              9.702286
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1             6.444920
## Pocillopora_acuta_KBHIv2___TS.g3591.t1                  6.644206
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1              8.425329
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2             6.100439
dim(DEGvst)
## [1] 25  6

Make a matrix for computing similarity

mat <- DEGvst # Make an expression object
mat <- mat - rowMeans(mat) # Difference in expression compared to average across all samples

5. Plot results

DEG heatmap

Make a heatmap

hmTreatment <- subset(treatmentinfo, select=c(condition))
col <- list(); col[[condition]] = treatment.colors # Setup list for heatmap colors
hm_ann_col <- HeatmapAnnotation(df=hmTreatment, col = col) # Make dataframe for column naming
dend = cluster_within_group(mat, hmTreatment[[condition]])

DEGheatmap <-  Heatmap(mat,
          cluster_columns = dend,
          column_split = 2,
          name = "Gene expression (vst)",
          show_row_names = F,
          top_annotation = hm_ann_col,
          show_column_names = F,
          row_dend_side = "left",
          column_dend_height = unit(0.5, "in"),
          row_title_side = "right",
          row_title_rot = 0,
          row_dend_reorder = TRUE,
          row_gap = unit(2.5, "mm"),
          border = TRUE,
          column_names_gp =  gpar(fontsize = 10)
        )

draw(DEGheatmap)

Principle components plot of DEGs

pca <- prcomp(t(mat)) # Calculate eigengenes
percentVar <- pca$sdev^2/sum(pca$sdev^2) # Save % variation by PC1 and PC2
d <- data.frame(treatmentinfo, PC1 = pca$x[, 1], PC2 = pca$x[, 2])
DEG_PCA <- ggplot(data = d, aes_string(x = "PC1", y = "PC2")) +
  geom_point(size = 4, aes(colour=!!rlang::sym(condition))) +
  xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) +
  ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) +
  coord_fixed() +
  scale_color_manual(values = treatment.colors) +
  theme_bw() + # Set background color
  theme(panel.border = element_blank(), # Set border
        panel.grid.major = element_blank(), # Set major gridlines
        panel.grid.minor = element_blank(), # Set minor gridlines
        axis.line = element_line(colour = "black", size = 0.6), # Set axes color
        plot.background=element_blank(), # Set the plot background
        axis.title = element_text(size = 14), # Axis title size
        axis.text = element_blank()) # Axis text size and view plot
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
DEG_PCA